imports

import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, accuracy_score

import networkx as nx
# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor

# Load the raw transactions; iloc[:, 1:] drops the first CSV column
# (presumably a saved row index — confirm against the file).
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]

# Parse the timestamp column with a single vectorized call. Converting row by
# row through map/lambda yields the same datetime column but invokes the parser
# once per row, which is very slow on a frame of this size.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	2019-01-01 00:00:00	2.703190e+15	fraud_Rippin, Kub and Mann	misc_net	4.97	Jennifer	Banks	F	561 Perry Cove	Moravian Falls	...	36.0788	-81.1781	3495	Psychologist, counselling	1988-03-09	0b242abb623afc578575680df30655b9	1325376018	36.011293	-82.048315	0
1	2019-01-01 00:00:00	6.304230e+11	fraud_Heller, Gutmann and Zieme	grocery_pos	107.23	Stephanie	Gill	F	43039 Riley Greens Suite 393	Orient	...	48.8878	-118.2105	149	Special educational needs teacher	1978-06-21	1f76529f8574734946361c461b024d99	1325376044	49.159047	-118.186462	0
2	2019-01-01 00:00:00	3.885950e+13	fraud_Lind-Buckridge	entertainment	220.11	Edward	Sanchez	M	594 White Dale Suite 530	Malad City	...	42.1808	-112.2620	4154	Nature conservation officer	1962-01-19	a1a22d70485983eac12b5b88dad1cf95	1325376051	43.150704	-112.154481	0
3	2019-01-01 00:01:00	3.534090e+15	fraud_Kutch, Hermiston and Farrell	gas_transport	45.00	Jeremy	White	M	9443 Cynthia Court Apt. 038	Boulder	...	46.2306	-112.1138	1939	Patent attorney	1967-01-12	6b849c168bdad6f867558c3793159a81	1325376076	47.034331	-112.561071	0
4	2019-01-01 00:03:00	3.755340e+14	fraud_Keeling-Crist	misc_pos	41.96	Tyler	Garcia	M	408 Bradley Rest	Doe Hill	...	38.4207	-79.4629	99	Dance movement psychotherapist	1986-03-28	a41d7549acf90789359a9aa5346dcb46	1325376186	38.674999	-78.632459	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1048570	2020-03-10 16:07:00	6.011980e+15	fraud_Fadel Inc	health_fitness	77.00	Haley	Wagner	F	05561 Farrell Crescent	Annapolis	...	39.0305	-76.5515	92106	Accountant, chartered certified	1943-05-28	45ecd198c65e81e597db22e8d2ef7361	1362931649	38.779464	-76.317042	0
1048571	2020-03-10 16:07:00	4.839040e+15	fraud_Cremin, Hamill and Reichel	misc_pos	116.94	Meredith	Campbell	F	043 Hanson Turnpike	Hedrick	...	41.1826	-92.3097	1583	Geochemist	1999-06-28	c00ce51c6ebb7657474a77b9e0b51f34	1362931670	41.400318	-92.726724	0
1048572	2020-03-10 16:08:00	5.718440e+11	fraud_O'Connell, Botsford and Hand	home	21.27	Susan	Mills	F	005 Cody Estates	Louisville	...	38.2507	-85.7476	736284	Engineering geologist	1952-04-02	17c9dc8b2a6449ca2473726346e58e6c	1362931711	37.293339	-84.798122	0
1048573	2020-03-10 16:08:00	4.646850e+18	fraud_Thompson-Gleason	health_fitness	9.52	Julia	Bell	F	576 House Crossroad	West Sayville	...	40.7320	-73.1000	4056	Film/video editor	1990-06-25	5ca650881b48a6a38754f841c23b77ab	1362931718	39.773077	-72.213209	0
1048574	2020-03-10 16:08:00	2.283740e+15	fraud_Buckridge PLC	misc_pos	6.81	Shannon	Williams	F	9345 Spencer Junctions Suite 183	Alpharetta	...	34.0770	-84.3033	165556	Prison officer	1997-12-27	8d0a575fe635bbde12f1a2bffc126731	1362931730	33.601468	-83.891921	0

1048575 rows × 22 columns

def throw(df, percentage):
    """Discard non-fraud rows so that fraud makes up ``percentage`` of the result.

    All rows with ``is_fraud == 1`` are kept. Rows with ``is_fraud == 0`` are
    randomly downsampled (fixed seed 42, so the result is reproducible).

    Args:
        df: DataFrame with an ``is_fraud`` column holding 0/1 values.
        percentage: Target share of fraud rows in the output, e.g. ``0.5``.

    Returns:
        A new DataFrame: the fraud rows followed by the sampled non-fraud rows,
        with the original index labels preserved.
    """
    fraud_rows = df.loc[df['is_fraud'] == 1].copy()
    normal_rows = df.loc[df['is_fraud'] == 0].copy()

    # Keeping k normal rows gives a fraud share of n_fraud / (n_fraud + k);
    # solving for k yields n_fraud * (1 - p) / p. Dividing by the number of
    # normal rows turns that count into the sampling fraction.
    keep_fraction = (len(fraud_rows) * (1-percentage)) / (len(normal_rows) * percentage)
    kept_normal = normal_rows.sample(frac=keep_fraction, random_state=42)

    return pd.concat([fraud_rows, kept_normal])

# Balanced working set: downsample non-fraud rows until fraud is half of the data.
df = throw(df=fraudTrain, percentage=0.5)

autogluon

A. 데이터

def bipartite(df, node_1, node_2, graph_type=None):
    """Build a graph whose edges connect ``node_1`` values to ``node_2`` values.

    Every distinct value found in either column becomes one node with an
    integer id. Rows sharing the same (``node_1``, ``node_2``) pair collapse
    into a single edge carrying two attributes:

    * ``label``  - 1 if any row of the pair has ``is_fraud > 0``, else 0.
    * ``weight`` - sum of ``amt`` over the rows of the pair.

    Args:
        df: DataFrame containing ``node_1``, ``node_2``, ``amt`` and
            ``is_fraud`` columns. It is not modified.
        node_1: Column whose values become the edge start points.
        node_2: Column whose values become the edge end points.
        graph_type: Forwarded to ``nx.from_edgelist`` as ``create_using``.
            ``None`` (default) builds a fresh ``nx.Graph`` on every call. The
            previous default, ``nx.Graph()``, was a single instance created at
            definition time that networkx clears and reuses, so a second call
            silently overwrote the graph returned by the first. A graph
            instance passed explicitly is still cleared and reused as before.

    Returns:
        The populated networkx graph.
    """
    df = df.copy()

    # Both columns share one id space. dict.fromkeys keeps first-seen order, so
    # ids are reproducible across runs; iterating a set of strings is not,
    # because string hashing is randomized per interpreter process.
    distinct_values = dict.fromkeys(df[node_1].values.tolist() + df[node_2].values.tolist())
    mapping = {value: node_id for node_id, value in enumerate(distinct_values)}

    df["from"] = [mapping[value] for value in df[node_1].values.tolist()]  # edge start point
    df["to"] = [mapping[value] for value in df[node_2].values.tolist()]  # edge end point

    # One row per (from, to) pair: count of fraud rows and total amount.
    edges = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    # Collapse the fraud count to a binary edge label.
    edges["is_fraud"] = (edges["is_fraud"] > 0).astype(int)

    G = nx.from_edgelist(edges[["from", "to"]].values, create_using=graph_type)

    # Attach the attributes through plain Python tuples; zip avoids the
    # per-row Series construction that iterrows performs.
    endpoints = list(zip(edges["from"].tolist(), edges["to"].tolist()))
    nx.set_edge_attributes(G, dict(zip(endpoints, edges["is_fraud"].tolist())), "label")
    nx.set_edge_attributes(G, dict(zip(endpoints, edges["amt"].tolist())), "weight")

    return G




# Graph linking each cc_num value to the merchant values it transacted with.
G = bipartite(df, "cc_num", "merchant")



def G_split(G, test_size):
    """Split the edges of ``G`` into a training graph and a test graph.

    Edge positions (indices into ``list(G.edges)``) are shuffled and split with
    ``train_test_split`` using ``random_state=42``. Each resulting graph holds
    only its own share of the edges but all nodes of ``G``; nodes left without
    an edge are added back as isolated nodes.

    Args:
        G: networkx graph whose edges carry a ``label`` attribute.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(train_graph, test_graph)`` of independent graph copies.
    """
    edge_list = list(G.edges)
    positions = list(range(len(G.edges)))
    edge_labels = list(nx.get_edge_attributes(G, "label").values())

    # The label halves are not needed here; only the positions are used.
    train_pos, test_pos, _train_y, _test_y = train_test_split(
        positions,
        edge_labels,
        test_size=test_size,
        random_state=42,
    )

    def _subgraph_with_all_nodes(chosen):
        # edge_subgraph returns a view, so copy before adding the missing nodes.
        part = G.edge_subgraph([edge_list[i] for i in chosen]).copy()
        part.add_nodes_from(list(set(G.nodes) - set(part.nodes)))
        return part

    train_part = _subgraph_with_all_nodes(train_pos)
    test_part = _subgraph_with_all_nodes(test_pos)
    return train_part, test_part

# Hold out 20% of the edges: train_graph and test_graph each keep every node
# of G but only their own share of the edges.
train_graph, test_graph = G_split(G, test_size=0.2)

# Recover the edge positions and labels behind that split. G_split does not
# return them, so the identical call is repeated here; test_size and
# random_state must match the values used inside G_split, otherwise these
# positions no longer correspond to train_graph / test_graph.
# (This statement was indented in the original, which is an IndentationError
# when the file is run outside IPython.)
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G.edges))),
    list(nx.get_edge_attributes(G, "label").values()),
    test_size=0.2,
    random_state=42,
)

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Edge list of the full graph; train_edges / test_edges are positions into it.
edgs = list(G.edges)

# Fit node2vec on the training graph only, so the test edges do not influence
# the learned node vectors.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]

# Keep the embeddings of every embedder, keyed by class name. Previously each
# iteration overwrote train_embeddings / test_embeddings, so the work done for
# the first three embedders was discarded.
edge_embeddings = {}
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    # The embedder is indexed by the string form of the two node ids.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    edge_embeddings[cl.__name__] = (train_embeddings, test_embeddings)

# As before, train_embeddings / test_embeddings hold the result of the last
# embedder in `classes` (WeightedL2Embedder) once the loop has finished.

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00,  2.59it/s]

tr = TabularDataset(train_embeddings)

B. predictor 생성

predictr = TabularPredictor(train_labels)

No path specified. Models will be saved in: "AutogluonModels/ag-20240117_122745/"

C. 적합(fit)

# Train on the training rows using AutoGluon's 'best_quality' preset.
predictr.fit(train_data=tr, presets="best_quality")

Presets specified: ['best_quality']

AttributeError: 'TabularDataset' object has no attribute 'unique'

# Show the per-model summary table (validation score, fit and predict times)
# for the models trained by fit().
predictr.leaderboard()

                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.894772       0.020896   4.365760                0.009703           2.119462            2       True         14
1           CatBoost_BAG_L1   0.894661       0.004997   1.386780                0.004997           1.386780            1       True          7
2            XGBoost_BAG_L1   0.894439       0.025061   0.600274                0.025061           0.600274            1       True         11
3      LightGBMLarge_BAG_L1   0.894106       0.006197   0.859518                0.006197           0.859518            1       True         13
4           LightGBM_BAG_L1   0.893995       0.015738   0.650386                0.015738           0.650386            1       True          4
5     NeuralNetTorch_BAG_L1   0.888778       0.050014  14.929281                0.050014          14.929281            1       True         12
6         LightGBMXT_BAG_L1   0.885004       0.030494   0.456313                0.030494           0.456313            1       True          3
7     KNeighborsUnif_BAG_L1   0.878233       0.011929   0.005328                0.011929           0.005328            1       True          1
8    NeuralNetFastAI_BAG_L1   0.867022       0.089430   7.351443                0.089430           7.351443            1       True         10
9     KNeighborsDist_BAG_L1   0.864136       0.009754   0.004292                0.009754           0.004292            1       True          2
10    ExtraTreesEntr_BAG_L1   0.862582       0.211025   0.299140                0.211025           0.299140            1       True          9
11    ExtraTreesGini_BAG_L1   0.862249       0.203468   0.341149                0.203468           0.341149            1       True          8
12  RandomForestEntr_BAG_L1   0.856033       0.185369   0.526263                0.185369           0.526263            1       True          6
13  RandomForestGini_BAG_L1   0.856033       0.190420   0.333284                0.190420           0.333284            1       True          5

	model	score_val	pred_time_val	fit_time	pred_time_val_marginal	fit_time_marginal	stack_level	can_infer	fit_order
0	WeightedEnsemble_L2	0.894772	0.020896	4.365760	0.009703	2.119462	2	True	14
1	CatBoost_BAG_L1	0.894661	0.004997	1.386780	0.004997	1.386780	1	True	7
2	XGBoost_BAG_L1	0.894439	0.025061	0.600274	0.025061	0.600274	1	True	11
3	LightGBMLarge_BAG_L1	0.894106	0.006197	0.859518	0.006197	0.859518	1	True	13
4	LightGBM_BAG_L1	0.893995	0.015738	0.650386	0.015738	0.650386	1	True	4
5	NeuralNetTorch_BAG_L1	0.888778	0.050014	14.929281	0.050014	14.929281	1	True	12
6	LightGBMXT_BAG_L1	0.885004	0.030494	0.456313	0.030494	0.456313	1	True	3
7	KNeighborsUnif_BAG_L1	0.878233	0.011929	0.005328	0.011929	0.005328	1	True	1
8	NeuralNetFastAI_BAG_L1	0.867022	0.089430	7.351443	0.089430	7.351443	1	True	10
9	KNeighborsDist_BAG_L1	0.864136	0.009754	0.004292	0.009754	0.004292	1	True	2
10	ExtraTreesEntr_BAG_L1	0.862582	0.211025	0.299140	0.211025	0.299140	1	True	9
11	ExtraTreesGini_BAG_L1	0.862249	0.203468	0.341149	0.203468	0.341149	1	True	8
12	RandomForestEntr_BAG_L1	0.856033	0.185369	0.526263	0.185369	0.526263	1	True	6
13	RandomForestGini_BAG_L1	0.856033	0.190420	0.333284	0.190420	0.333284	1	True	5

D. 예측(predict)

# Accuracy on the training rows: share of rows whose prediction equals
# `is_fraud`. Requires `tr` to carry an `is_fraud` column. Measured on data the
# models were fitted on, so read it as an optimistic figure.
(tr.is_fraud == predictr.predict(tr)).mean()

0.8967698967698968

# Accuracy on the held-out rows: share of rows whose prediction equals
# `is_fraud`. Requires `tst` to hold the test rows with an `is_fraud` column.
(tst.is_fraud == predictr.predict(tst)).mean()

0.9021908791725435